import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
np.set_printoptions(linewidth=100)
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris, fetch_lfw_people, load_breast_cancer
from sklearn.model_selection import train_test_split
def pca(A, n_components=2):
    '''
    Project the rows of A onto its first `n_components` principal
    components, computed via the SVD.

    Parameters
    ----------
    A : (m, k) ndarray
        Data matrix, one sample per row.
    n_components : int
        Number of principal components to keep (default 2).

    Returns
    -------
    (m, n_components) ndarray
        The projected data.
    '''
    U, S, Vh = np.linalg.svd(A, full_matrices=False)
    # BUG FIX: the rows of Vh are the principal axes, so the projection is
    # A @ Vh[:n_components].T.  The original used Vh[:, :n_components],
    # i.e. the first *columns* of Vh, which are not the principal axes.
    return A @ Vh[:n_components].T
# import data
# iris: 150 samples, 4 features, 3 classes
iris = load_iris()
# NOTE(review): no random_state is passed to any split below, so every run
# uses a different train/test partition — results are not reproducible.
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris.data, iris.target, test_size=.5
)
# breast cancer: 569 samples, binary target
can = load_breast_cancer()
can_X_train, can_X_test, can_y_train, can_y_test = train_test_split(
    can.data, can.target, test_size=.5
)
# faces: fetch_lfw_people downloads the LFW data on first use
# (network access required)
peo = fetch_lfw_people()
# NOTE(review): the peo_* splits are not referenced anywhere in this file —
# confirm whether the faces experiment was dropped intentionally.
peo_X_train, peo_X_test, peo_y_train, peo_y_test = train_test_split(peo.data, peo.target, test_size=.7)
def plot_data(X, y, classes, title):
    '''
    Scatter-plot the first two columns of X, one series per class label
    0..classes-1, then save the figure under ./images/ and display it.
    '''
    for label in range(classes):
        members = y == label
        plt.scatter(X[members, 0], X[members, 1],
                    label=f'type {label}', marker='.', lw=.3)
    plt.legend()
    plt.title(title)
    plt.savefig(f'./images/{title}.pdf')
    plt.show()
# give background on how well PCA works for each data set
# --- iris baseline: project to 2 PCs and visualise the classes ---
trans_iris_train = pca(iris_X_train)
trans_iris_test = pca(iris_X_test)
c1 = iris_y_train == 0
c2 = iris_y_train == 1
c3 = iris_y_train == 2
plt.scatter(trans_iris_train[c1,0], trans_iris_train[c1,1], label='type1')
plt.scatter(trans_iris_train[c2,0], trans_iris_train[c2,1], label='type2')
plt.scatter(trans_iris_train[c3,0], trans_iris_train[c3,1], label='type3')
plt.title('Natural iris data PCA, no noise')
# BUG FIX: draw the legend before saving; the original called savefig
# first, so the saved PDF had no legend.
plt.legend()
plt.savefig('Unaltered_iris.pdf')
plt.show()
# baseline accuracy of both classifier families on the projected iris data
RFC = RandomForestClassifier(n_estimators=50, max_depth=10)
RFC = RFC.fit(trans_iris_train, iris_y_train)
score = sum(RFC.predict(trans_iris_test) == iris_y_test) / len(iris_y_test)
print(f'Iris + random forest baseline = {score}')
KNC = KNeighborsClassifier(n_neighbors=3)
KNC.fit(trans_iris_train, iris_y_train)
score = sum(KNC.predict(trans_iris_test) == iris_y_test) / len(iris_y_test)
print(f'Iris + K-Neighbors baseline = {score}')
# --- breast-cancer baseline: same procedure on the cancer data ---
trans_can_train = pca(can_X_train)
trans_can_test = pca(can_X_test)
c1 = can_y_train == 0
c2 = can_y_train == 1
for mask, name in ((c1, 'type1'), (c2, 'type2')):
    plt.scatter(trans_can_train[mask, 0], trans_can_train[mask, 1],
                label=name, marker='.', lw=.3)
plt.title('Natural cancer data PCA, no noise')
plt.legend()
plt.savefig('Unaltered_cancer.pdf')
plt.show()
# baseline accuracy of both classifier families on the projected data
RFC = RandomForestClassifier(n_estimators=20, max_depth=10)
RFC.fit(trans_can_train, can_y_train)
score = np.mean(RFC.predict(trans_can_test) == can_y_test)
print(f'Cancer + random forest baseline = {score}')
KNC = KNeighborsClassifier(n_neighbors=3)
KNC.fit(trans_can_train, can_y_train)
score = np.mean(KNC.predict(trans_can_test) == can_y_test)
print(f'Cancer + K-Neighbors baseline = {score}')
def noiser(A, noise, params, random_state=None):
    '''
    Return a copy of A with i.i.d. noise added elementwise.

    Parameters
    ----------
    A : ndarray
        Data to perturb (any shape).
    noise : str
        One of 'normal', 'beta', 'student-t', 'uniform', 'gamma'.
    params : tuple
        Positional parameters for the chosen scipy.stats distribution
        (shape parameters first, then loc/scale where applicable).
    random_state : int, Generator, or None
        Seed forwarded to ``rvs`` so draws can be reproduced
        (default None keeps the original unseeded behaviour).

    Returns
    -------
    ndarray
        A + noise, same shape as A.

    Raises
    ------
    KeyError
        If ``noise`` is not a recognised distribution name.
    '''
    noise_dict = {
        'normal': stats.norm, 'student-t': stats.t,
        'beta': stats.beta, 'gamma': stats.gamma,
        'uniform': stats.uniform
    }
    n_func = noise_dict[noise](*params)
    # one draw per element of A; random_state makes the draw repeatable
    return A + n_func.rvs(size=A.shape, random_state=random_state)
# set up the parameters grids I will use to test PCA
# beta noise: shape parameters a, b swept over a 15x15 grid
beta_params = {
    'a': np.linspace(.1,5,15),
    'b': np.linspace(.1,5,15)
}
# normal noise: mean and standard deviation grids
# NOTE(review): std starts at 0 — scipy appears to treat scale=0 as a
# point mass at the mean (pure constant offset); confirm this is intended.
normal_params = {
    'mean': np.linspace(-1,1,15),
    'std': np.linspace(0,1,15)
}
# uniform noise: left endpoint a and interval length (b = a + length)
uniform_params = {
    'a': np.linspace(0,2,15),
    'length': np.linspace(.5,2,15)
}
# student-t noise: only 'df' is read by student_t_testing below; the
# 'mean'/'std' grids are never used — TODO confirm they were dropped
# intentionally.
student_t_params = {
    'df': np.linspace(.2,7,100),
    'mean': np.linspace(0,10,50),
    'std': np.linspace(0,10,20)
}
# gamma noise: shape a and scale grids (reported downstream as rate 1/scale)
gamma_params = {
    'a': np.linspace(.1,10,15),
    'scale': np.linspace(.1,10,15)
}
def beta_testing(X_train,y_train,X_test,y_test,n,Classifier,params,plot=True):
    '''
    Sweep the (a, b) grid in beta_params: add Beta(a, b) noise to the
    train/test data, project to 2 principal components, and score a fresh
    Classifier(**params) per grid point.  Returns rows of (a, b, accuracy);
    every 25th grid point is plotted when `plot` is True.
    '''
    a_grid = beta_params['a']
    b_grid = beta_params['b']
    scores = np.zeros((len(a_grid) * len(b_grid), 3))
    combos = ((a, b) for a in a_grid for b in b_grid)
    for i, (a, b) in enumerate(combos):
        # perturb, project, then fit/score a fresh classifier
        noisy_train = noiser(X_train, 'beta', (a, b))
        noisy_test = noiser(X_test, 'beta', (a, b))
        proj_train = pca(noisy_train)
        proj_test = pca(noisy_test)
        model = Classifier(**params)
        model.fit(proj_train, y_train)
        acc = np.mean(model.predict(proj_test) == y_test)
        scores[i] = (a, b, acc)
        # plot a thinned-out sample of the sweep
        if plot and i % 25 == 0:
            title = 'Beta: a={a:.2f}, b={b:.2f}, score={score:.2f}'.format(
                a=a, b=b, score=scores[i, 2])
            plot_data(proj_train, y_train, n, title)
    return scores
# evaluate beta noise with both classifier families and print LaTeX-ready
# tables of the 10 best and 10 worst parameter settings
for clf_label, Clf, clf_kwargs in (
        ('RANDOM FOREST', RandomForestClassifier,
         {'n_estimators': 50, 'max_depth': 10}),
        ('K-NEIGHBORS', KNeighborsClassifier, {'n_neighbors': 3}),
):
    scores = beta_testing(
        can_X_train, can_y_train, can_X_test, can_y_test, 2,
        Clf, clf_kwargs, plot=True
    )
    print(f'\nBETA NOISE WITH {clf_label}\n')
    ranked = np.argsort(scores[:, 2])[::-1]
    for quality, chosen in (('best', ranked[:10]), ('worst', ranked[-10:])):
        print(f'The 10 {quality} performing parameters are :')
        print('\ta\tb\tscore')
        print('\t'+'--'*len('\ta\tb\tscore'))
        for idx in chosen:
            a, b, score = scores[idx]
            print('\t{a:.2f}&\t{b:.2f}&\t{score:.3f}\\\\'.format(a=a, b=b, score=score))
        # the original prints a blank line only between best and worst
        if quality == 'best':
            print()
# do normal testing
def normal_testing(X_train,y_train,X_test,y_test,n,Classifier,params,plot=True):
    '''
    Sweep the normal-noise grid (normal_params): add N(mean, std) noise to
    the train/test data, project with PCA, and score a fresh
    Classifier(**params) per grid point.

    Parameters
    ----------
    X_train, y_train, X_test, y_test : ndarrays
        Data to perturb and classify.
    n : int
        Number of classes (forwarded to plot_data).
    Classifier : class
        Classifier type instantiated fresh at each grid point.
    params : dict
        Keyword arguments for Classifier.
    plot : bool
        When True, plot every 25th grid point.

    Returns
    -------
    scores : (len(means)*len(stds), 3) ndarray
        Rows of (mean, std, accuracy).
    '''
    means = normal_params['mean']
    stds = normal_params['std']
    i = 0
    scores = np.zeros((len(means)*len(stds),3))
    for mean in means:
        for std in stds:
            # first we apply noise (independently to train and test)
            X_train_noise = noiser(X_train, 'normal', (mean,std))
            X_test_noise = noiser(X_test, 'normal', (mean,std))
            # next we apply pca
            train_data = pca(X_train_noise)
            test_data = pca(X_test_noise)
            # train a fresh classifier and record its accuracy
            classifier = Classifier(**params)
            classifier.fit(train_data, y_train)
            score = sum(classifier.predict(test_data) == y_test) / len(y_test)
            scores[i] = (mean,std,score)
            # every once in a while we plot the results
            if i%25 == 0 and plot:
                # FIX: raw string — '\m' and '\s' are invalid escape
                # sequences in a normal literal (SyntaxWarning on modern
                # Python); the rendered text is unchanged.
                title_format = r'Normal: $\mu$={a:.2f}, $\sigma$={b:.2f}, score={score:.2f}'
                title = title_format.format(a=mean, b=std, score=scores[i,2])
                plot_data(train_data, y_train, n, title)
            i += 1
    return scores
# evaluate normal noise with both classifier families and print LaTeX-ready
# tables of the 10 best and 10 worst parameter settings
for clf_label, Clf, clf_kwargs in (
        ('RANDOM FOREST', RandomForestClassifier,
         {'n_estimators': 50, 'max_depth': 10}),
        ('K-NEIGHBORS', KNeighborsClassifier, {'n_neighbors': 3}),
):
    scores = normal_testing(
        can_X_train, can_y_train, can_X_test, can_y_test, 2,
        Clf, clf_kwargs, plot=True
    )
    print(f'\nNORMAL NOISE WITH {clf_label}\n')
    ranked = np.argsort(scores[:, 2])[::-1]
    for quality, chosen in (('best', ranked[:10]), ('worst', ranked[-10:])):
        print(f'The 10 {quality} performing parameters are :')
        print('\tMean\tStd\tscore')
        # separator length deliberately preserved from the original
        # (it reuses the beta table's header string)
        print('\t'+'--'*len('\ta\tb\tscore'))
        for idx in chosen:
            a, b, score = scores[idx]
            print('\t{a:.2f}&\t{b:.2f}&\t{score:.3f}\\\\'.format(a=a, b=b, score=score))
        # blank line only between the best and worst tables
        if quality == 'best':
            print()
# do student t testing
def student_t_testing(X_train,y_train,X_test,y_test,n,Classifier,params,plot=True):
    '''
    Sweep the df grid in student_t_params: add t(df) noise to the
    train/test data, project with PCA, and score a fresh
    Classifier(**params) per grid point.  Returns rows of (df, accuracy);
    every 25th grid point is plotted when `plot` is True.
    '''
    df_grid = student_t_params['df']
    scores = np.zeros((len(df_grid), 2))
    for i, df in enumerate(df_grid):
        # perturb, project, then fit/score a fresh classifier
        noisy_train = noiser(X_train, 'student-t', (df,))
        noisy_test = noiser(X_test, 'student-t', (df,))
        proj_train = pca(noisy_train)
        proj_test = pca(noisy_test)
        model = Classifier(**params)
        model.fit(proj_train, y_train)
        acc = np.mean(model.predict(proj_test) == y_test)
        scores[i] = (df, acc)
        # plot a thinned-out sample of the sweep
        if plot and i % 25 == 0:
            title = 'Student-t: df={a:.2f} score={score:.2f}'.format(
                a=df, score=scores[i, 1])
            plot_data(proj_train, y_train, n, title)
    return scores
# evaluate student-t noise with both classifier families and print
# LaTeX-ready tables of the 10 best and 10 worst df values
for clf_label, Clf, clf_kwargs in (
        ('RANDOM FOREST', RandomForestClassifier,
         {'n_estimators': 50, 'max_depth': 10}),
        ('K-NEIGHBORS', KNeighborsClassifier, {'n_neighbors': 3}),
):
    scores = student_t_testing(
        can_X_train, can_y_train, can_X_test, can_y_test, 2,
        Clf, clf_kwargs, plot=True
    )
    print(f'\nSTUDENT-T WITH {clf_label}\n')
    ranked = np.argsort(scores[:, 1])[::-1]
    for quality, chosen in (('best', ranked[:10]), ('worst', ranked[-10:])):
        print(f'The 10 {quality} performing parameters are :')
        print('\tdf\tscore')
        print('\t'+'--'*len('\tdf\tscore'))
        for idx in chosen:
            df, score = scores[idx]
            print('\t{a:.2f}&\t{score:.3f}\\\\'.format(a=df, score=score))
# do uniform testing
def uniform_testing(X_train,y_train,X_test,y_test,n,Classifier,params,plot=True):
    '''
    Sweep the uniform-noise grid (uniform_params): add U(a, b) noise with
    b = a + length to the train/test data, project with PCA, and score a
    fresh Classifier(**params) per grid point.

    Returns
    -------
    scores : (len(as_)*len(lengths), 3) ndarray
        Rows of (a, b, accuracy).
    '''
    as_ = uniform_params['a']
    lengths = uniform_params['length']
    i = 0
    scores = np.zeros((len(as_)*len(lengths),3))
    for a in as_:
        for length in lengths:
            b = a + length
            # BUG FIX: scipy.stats.uniform(loc, scale) is U(loc, loc+scale),
            # so the original call noiser(..., (a, b)) actually sampled
            # U(a, a+b).  Passing (a, length) yields the intended U(a, b).
            X_train_noise = noiser(X_train, 'uniform', (a, length))
            X_test_noise = noiser(X_test, 'uniform', (a, length))
            # next we apply pca
            train_data = pca(X_train_noise)
            test_data = pca(X_test_noise)
            # train a fresh classifier and record its accuracy
            classifier = Classifier(**params)
            classifier.fit(train_data, y_train)
            score = sum(classifier.predict(test_data) == y_test) / len(y_test)
            scores[i] = (a,b,score)
            # every once in a while we plot the results
            if i%25 == 0 and plot:
                title_format = 'Uniform: a={a:.2f}, b={b:.2f}, score={score:.2f}'
                title = title_format.format(a=a, b=b, score=scores[i,2])
                plot_data(train_data, y_train, n, title)
            i += 1
    return scores
# evaluate uniform noise with both classifier families and print
# LaTeX-ready tables of the 10 best and 10 worst parameter settings
for clf_label, Clf, clf_kwargs in (
        ('RANDOM FOREST', RandomForestClassifier,
         {'n_estimators': 50, 'max_depth': 10}),
        ('K-NEIGHBORS', KNeighborsClassifier, {'n_neighbors': 3}),
):
    scores = uniform_testing(
        can_X_train, can_y_train, can_X_test, can_y_test, 2,
        Clf, clf_kwargs, plot=True
    )
    print(f'\nUNIFORM WITH {clf_label}\n')
    ranked = np.argsort(scores[:, 2])[::-1]
    for quality, chosen in (('best', ranked[:10]), ('worst', ranked[-10:])):
        print(f'The 10 {quality} performing parameters are :')
        print('\ta\tb\tscore')
        print('\t'+'--'*len('\ta\tb\tscore'))
        for idx in chosen:
            a, b, score = scores[idx]
            print('\t{a:.2f}&\t{b:.2f}&\t{score:.3f}\\\\'.format(a=a, b=b, score=score))
# do gamma testing
def gamma_testing(X_train,y_train,X_test,y_test,n,Classifier,params,plot=True):
    '''
    Sweep the (a, scale) grid in gamma_params: add Gamma(a, scale) noise
    to the train/test data, project with PCA, and score a fresh
    Classifier(**params) per grid point.  Returns rows of
    (a, rate=1/scale, accuracy).
    '''
    shape_grid = gamma_params['a']
    scale_grid = gamma_params['scale']
    scores = np.zeros((len(shape_grid)*len(scale_grid), 3))
    combos = ((a, s) for a in shape_grid for s in scale_grid)
    for i, (a, s) in enumerate(combos):
        # (a, 0, s) -> gamma(shape=a, loc=0, scale=s)
        noisy_train = noiser(X_train, 'gamma', (a, 0, s))
        noisy_test = noiser(X_test, 'gamma', (a, 0, s))
        proj_train = pca(noisy_train)
        proj_test = pca(noisy_test)
        model = Classifier(**params)
        model.fit(proj_train, y_train)
        acc = np.mean(model.predict(proj_test) == y_test)
        # record the rate b = 1/scale rather than the scale itself
        scores[i] = (a, 1/s, acc)
        # plot a thinned-out sample of the sweep
        if plot and i % 25 == 0:
            title = 'Gamma: a={a:.2f}, b={b:.2f}, score={score:.2f}'.format(
                a=a, b=1/s, score=scores[i, 2])
            plot_data(proj_train, y_train, n, title)
    return scores
# evaluate gamma noise with both classifier families and print LaTeX-ready
# tables of the 10 best and 10 worst parameter settings
for clf_label, Clf, clf_kwargs in (
        ('RANDOM FOREST', RandomForestClassifier,
         {'n_estimators': 50, 'max_depth': 10}),
        ('K-NEIGHBORS', KNeighborsClassifier, {'n_neighbors': 3}),
):
    scores = gamma_testing(
        can_X_train, can_y_train, can_X_test, can_y_test, 2,
        Clf, clf_kwargs, plot=True
    )
    print(f'\nGAMMA WITH {clf_label}\n')
    ranked = np.argsort(scores[:, 2])[::-1]
    for quality, chosen in (('best', ranked[:10]), ('worst', ranked[-10:])):
        print(f'The 10 {quality} performing parameters are :')
        print('\ta\tb\tscore')
        # FIX: the separator length was copy-pasted from the 2-column
        # student-t table (len('\tdf\tscore')); use this table's own
        # 3-column header so it matches the other a/b/score tables.
        print('\t'+'--'*len('\ta\tb\tscore'))
        for idx in chosen:
            a, b, score = scores[idx]
            print('\t{a:.2f}&\t{b:.2f}&\t{score:.3f}\\\\'.format(a=a, b=b, score=score))